Urllib库¶
内置的HTTP请求库¶
- 请求模块: urllib.request
- 异常处理模块: urllib.error
- url解析模块: urllib.parse
- robots.txt解析模块: urllib.robotparser
- 代理: ProxyHandler (翻墙)
- Cookie: http.cookiejar
- 网址拼接: urljoin
- 将字典转换成请求参数: urlencode
urllib¶
urlopen¶
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
import urllib.request #请求模块

response = urllib.request.urlopen('http://www.baidu.com') #打开这个url地址,get请求
print(response.read().decode('utf-8')) #读取源代码,这里解码用utf-8
1 | <!DOCTYPE html><!--STATUS OK--> |
1 | </html> |
import urllib.parse #url解析模块
import urllib.request

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8') #转换成二进制,这里传递过去word以及hello
response = urllib.request.urlopen('http://httpbin.org/post', data=data)#加上data是一种post请求
print(response.read())
1 | b'{\n "args": {}, \n "data": "", \n "files": {}, \n "form": {\n "word": "hello"\n }, \n "headers": {\n "Accept-Encoding": "identity", \n "Content-Length": "10", \n "Content-Type": "application/x-www-form-urlencoded", \n "Host": "httpbin.org", \n "User-Agent": "Python-urllib/3.6", \n "X-Amzn-Trace-Id": "Root=1-5f0d3ca2-bb53229b018e17799e02b1ae"\n }, \n "json": null, \n "origin": "183.207.182.162", \n "url": "http://httpbin.org/post"\n}\n'
|
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1) #timeout超时响应时间
print(response.read())
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
<ipython-input-9-624debaefd14> in <module>
1 import urllib.request
2
----> 3 response = urllib.request.urlopen('http://httpbin.org/get', timeout=1) #timeout超时响应时间
4 print(response.read())
D:\Anaconda3\envs\CPU\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
D:\Anaconda3\envs\CPU\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 req = meth(req)
525
--> 526 response = self._open(req, data)
527
528 # post-process response
D:\Anaconda3\envs\CPU\lib\urllib\request.py in _open(self, req, data)
542 protocol = req.type
543 result = self._call_chain(self.handle_open, protocol, protocol +
--> 544 '_open', req)
545 if result:
546 return result
D:\Anaconda3\envs\CPU\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
D:\Anaconda3\envs\CPU\lib\urllib\request.py in http_open(self, req)
1344
1345 def http_open(self, req):
-> 1346 return self.do_open(http.client.HTTPConnection, req)
1347
1348 http_request = AbstractHTTPHandler.do_request_
D:\Anaconda3\envs\CPU\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1319 except OSError as err: # timeout error
1320 raise URLError(err)
-> 1321 r = h.getresponse()
1322 except:
1323 h.close()
D:\Anaconda3\envs\CPU\lib\http\client.py in getresponse(self)
1352 try:
1353 try:
-> 1354 response.begin()
1355 except ConnectionError:
1356 self.close()
D:\Anaconda3\envs\CPU\lib\http\client.py in begin(self)
305 # read until we get a non-100 response
306 while True:
--> 307 version, status, reason = self._read_status()
308 if status != CONTINUE:
309 break
D:\Anaconda3\envs\CPU\lib\http\client.py in _read_status(self)
266
267 def _read_status(self):
--> 268 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
269 if len(line) > _MAXLINE:
270 raise LineTooLong("status line")
D:\Anaconda3\envs\CPU\lib\socket.py in readinto(self, b)
584 while True:
585 try:
--> 586 return self._sock.recv_into(b)
587 except timeout:
588 self._timeout_occurred = True
timeout: timed out
|
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e: #捕获异常
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
1 | TIME OUT |
响应¶
响应类型¶
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response)) #响应的类型
1 | <class 'http.client.HTTPResponse'> |
状态码、响应头¶
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status) #状态码
print(response.getheaders()) #响应头
print(response.getheader('Server')) #使用的服务器的类型
1 2 3 | 200
[('Connection', 'close'), ('Content-Length', '48997'), ('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Accept-Ranges', 'bytes'), ('Date', 'Tue, 14 Jul 2020 05:03:52 GMT'), ('Via', '1.1 varnish'), ('Age', '3128'), ('X-Served-By', 'cache-bwi5126-BWI, cache-hkg17920-HKG'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '42, 1733'), ('X-Timer', 'S1594703032.137189,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
nginx
|
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.read().decode('utf-8')) #response.read()获取响应体的内容
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | <!doctype html>
<!--[if lt IE 7]> <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 lt-ie8 lt-ie9"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">
<meta name="application-name" content="Python.org">
<meta name="msapplication-tooltip" content="The official home of the Python Programming Language">
<meta name="apple-mobile-web-app-title" content="Python.org">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="HandheldFriendly" content="True">
<meta name="format-detection" content="telephone=no">
<meta http-equiv="cleartype" content="on">
<meta http-equiv="imagetoolbar" content="false">
<script src="/static/js/libs/modernizr.js"></script>
<link href="/static/stylesheets/style.30afed881237.css" rel="stylesheet" type="text/css" title="default" />
<link href="/static/stylesheets/mq.eef77a5d2257.css" rel="stylesheet" type="text/css" media="not print, braille, embossed, speech, tty" />
|
<!--[if (lte IE 8)&(!IEMobile)]>
<![endif]-->
1 2 3 4 | <title>Welcome to Python.org</title>
<meta name="description" content="The official home of the Python Programming Language">
<meta name="keywords" content="Python programming language object oriented web free open source software license documentation download community">
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | <meta property="og:image" content="https://www.python.org/static/opengraph-icon-200x200.png">
<meta property="og:image:secure_url" content="https://www.python.org/static/opengraph-icon-200x200.png">
<meta property="og:url" content="https://www.python.org/">
<link rel="author" href="/static/humans.txt">
<link rel="alternate" type="application/rss+xml" title="Python Enhancement Proposals"
href="https://www.python.org/dev/peps/peps.rss/">
<link rel="alternate" type="application/rss+xml" title="Python Job Opportunities"
href="https://www.python.org/jobs/feed/rss/">
<link rel="alternate" type="application/rss+xml" title="Python Software Foundation News"
href="https://feeds.feedburner.com/PythonSoftwareFoundationNews">
<link rel="alternate" type="application/rss+xml" title="Python Insider"
href="https://feeds.feedburner.com/PythonInsider">
|
1 2 3 4 | <script src="/static/js/libs/masonry.pkgd.min.js"></script>
<script src="/static/js/libs/html-includes.js"></script>
<script type="text/javascript" src="/static/js/main-min.a3326162e3f0.js" charset="utf-8"></script>
|
<!--[if lte IE 7]>
<![endif]-->
1 2 | <!--[if lte IE 8]>
<script type="text/javascript" src="/static/js/plugins/getComputedStyle-min.c3860be1d290.js" charset="utf-8"></script>
|
<![endif]-->